Edit distance is defined as dissimilarity measure between strings by counting the minimum number of operations required to transform one string to another. Insertion, Deletion and Substitution of the characters are permitted. Hamming distance works on same length strings and only allows Substitutions.
In [8]:
def hamming_dist(s1, s2):
if len(s1) < len(s2):
s1, s2, = s2, s1
dist = 0
for i in range(len(s1)):
if s1[i] != s2[i]:
dist+=1
return dist
hamming_dist('TGCATAT','ATCCGAT')
Out[8]:
In [11]:
def normalize_string(text):
import string
text = text.lower()
text = text.translate(str.maketrans('','',
string.punctuation+
string.whitespace))
return text
def edit_distance(str1, str2):
'''
This function implements the Wagner-Fischer algorithm
to compute the edit distance between two strings
>>> edit_distance('bear', 'bar')
1
>>> edit_distance('"computer"', 'comm"uter')
1
>>> edit_distance('ea t', 'feat')
1
>>> edit_distance('bee!', 'beer')
1
>>> edit_distance('science', 'seance')
3
>>> edit_distance('laboratory', 'lobotomy')
4
'''
str1 = normalize_string(str1)
str2 = normalize_string(str2)
lenx = len(str1)+1
leny = len(str2)+1
d = [[0 for i in range(leny)] for i in range(lenx)]
for i in range(leny):
d[0][i] = i
for j in range(lenx):
d[j][0] = j
for j in range(1,leny):
for i in range(1,lenx):
if str1[i-1] == str2[j-1]:
d[i][j] = d[i-1][j-1]
else:
d[i][j] = min(d[i-1][j] +1,
d[i][j-1] +1,
d[i-1][j-1] +1)
return d[lenx-1][leny-1]
if __name__ == '__main__':
import doctest
doctest.testmod()
In [12]:
edit_distance('TGCATAT','ATCCGAT')
Out[12]:
Two different strings have an infinite number of possible edit distances. The minimum edit distance in our example is 4.
This question illustrates the Longest Common Subsequence (LCS)algorithm along with Backtracking. Consider again the subsequences TGCATAT and ATCCGAT.
In [27]:
sq1 = "TGCATAT"
sq2 = "ATCCGAT"
sq3 = ""
def LCS(seq1, seq2, penality = 0):
from math import inf
xaxis = len(seq1)+1
yaxis = len(seq2)+1
#initialize matrix
mat = [[-inf] * xaxis for i in range(yaxis)]
track = [[""] * xaxis for i in range(yaxis)]
for i in range(yaxis):
mat[i][0] = penality*i
for i in range(xaxis):
mat[0][i] = penality*i
for x in range(1,yaxis):
for y in range(1,xaxis):
if seq1[y-1] == seq2[x-1]:
mat[x][y] = mat[x-1][y-1] + 1
track[x][y] = "diag"
else:
mat[x][y] = max(mat[x][y-1],
mat[x-1][y])
if mat[x][y] == mat[x][y-1]:
track[x][y] = "left"
else:
track[x][y] = "up"
for i in mat:
print(i)
print()
for i in track:
print(i)
return backtrack(mat, track, seq1, seq2)
def backtrack(m, t, s1, s2):
pos = [len(s1),len(s2)]
algn1 = ""
algn2 = ""
while t[pos[1]][pos[0]] != "":
if t[pos[1]][pos[0]] == "diag":
algn1 = s1[pos[0]-1] + algn1
algn2 = s2[pos[1]-1] + algn2
pos = [pos[0]-1, pos[1]-1]
elif t[pos[1]][pos[0]] == "left":
algn2 = "-" + algn2
algn1 = s1[pos[0]-1] + algn1
pos[0] -= 1
elif t[pos[1]][pos[0]] == "up":
algn2 = s2[pos[1]-1] + algn2
algn1 = "-" + algn1
pos[1] -= 1
if s1[0] != s2[0]:
while pos[0] != 0:
algn2 = "-" + algn2
algn1 = s1[pos[0]-1] + algn1
pos[0] -= 1
while pos[1] != 0:
algn2 = s2[pos[1]-1] + algn2
algn1 = "-" + algn1
pos[1] -= 1
print("\nOriginal sequnces:")
print(s1)
print(s2)
print("Alignment:")
print(algn1)
print(algn2)
LCS(sq1,sq2)
This question illustrates the modified LCS algorithm for Global and Local alignments. Consider the subsequences TATATCGTTAG and AATCTGAT . Assuming a match score δ(u, u) of +4 , mismatch penalty δ(u, v) of -2 and gap penalty σ of -1.
In [33]:
def globalAlignment(sequence1, sequence2):
rows = len(sequence1)+1
columns = len(sequence2)+1
scoringMatrix = [[0] * columns for _ in range(rows)]
directionMatrix = [["empty"] * columns for _ in range(rows)]
for i in range(1, rows):
for j in range(1, columns):
option1 = scoringMatrix[i-1][j] -1
option2 = scoringMatrix[i][j-1] -1
if sequence1[i-1] == sequence2[j-1]:
option3 = scoringMatrix[i-1][j-1] + 4
else:
option3 = scoringMatrix[i-1][j-1] - 2
scoringMatrix[i][j] = max(option1, option2, option3)
if scoringMatrix[i][j] == option3:
directionMatrix[i][j] = "diagonal"
elif scoringMatrix[i][j] == option2:
directionMatrix[i][j] = "left"
else:
directionMatrix[i][j] = "top"
return (scoringMatrix, directionMatrix)
sequence1 = "TATATCGTTAG"
sequence2 = "AATCTGAT"
global_result = globalAlignment(sequence1, sequence2)
global_result[0]
Out[33]:
In [32]:
def localAlignment(sequence1, sequence2):
rows = len(sequence1)+1
columns = len(sequence2)+1
scoringMatrix = [[0] * columns for _ in range(rows)]
directionMatrix = [["empty"] * columns for _ in range(rows)]
for i in range(1, rows):
for j in range(1, columns):
option1 = scoringMatrix[i-1][j] -1
option2 = scoringMatrix[i][j-1] -1
if sequence1[i-1] == sequence2[j-1]:
option3 = scoringMatrix[i-1][j-1] + 4
else:
option3 = scoringMatrix[i-1][j-1] - 2
value = max(option1, option2, option3)
if value > 0:
scoringMatrix[i][j] = value
if scoringMatrix[i][j] == option3:
directionMatrix[i][j] = "diagonal"
elif scoringMatrix[i][j] == option2:
directionMatrix[i][j] = "left"
else:
directionMatrix[i][j] = "top"
else:
scoringMatrix[i][j] = 0
return (scoringMatrix, directionMatrix)
sequence1 = "TATATCGTTAG"
sequence2 = "AATCTGAT"
local_result = localAlignment(sequence1, sequence2)
local_result[0]
Out[32]: